Setup_audio (#322)

Croissant can now recognize fields of type sc:AudioObject and return their audio content, decoded with librosa, as output.
mlcommons · Feb 8, 2024 · 4f2e86f
1 parent 48938fb
commit 4f2e86f
Show file tree

Hide file tree

Showing 15 changed files with 196 additions and 1 deletion.
diff --git a/datasets/0.8/audio_test/data/Clap.mp3 b/datasets/0.8/audio_test/data/Clap.mp3
diff --git a/datasets/0.8/audio_test/data/Snap.mp3 b/datasets/0.8/audio_test/data/Snap.mp3
diff --git a/datasets/0.8/audio_test/metadata.json b/datasets/0.8/audio_test/metadata.json
@@ -0,0 +1,74 @@
+{
+  "@context": {
+    "@language": "en",
+    "@vocab": "https://schema.org/",
+    "column": "ml:column",
+    "data": {
+      "@id": "ml:data",
+      "@type": "@json"
+    },
+    "dataBiases": "ml:dataBiases",
+    "dataCollection": "ml:dataCollection",
+    "dataType": {
+      "@id": "ml:dataType",
+      "@type": "@vocab"
+    },
+    "dct": "http://purl.org/dc/terms/",
+    "extract": "ml:extract",
+    "field": "ml:field",
+    "fileProperty": "ml:fileProperty",
+    "format": "ml:format",
+    "includes": "ml:includes",
+    "isEnumeration": "ml:isEnumeration",
+    "jsonPath": "ml:jsonPath",
+    "ml": "http://mlcommons.org/schema/",
+    "parentField": "ml:parentField",
+    "path": "ml:path",
+    "personalSensitiveInformation": "ml:personalSensitiveInformation",
+    "recordSet": "ml:recordSet",
+    "references": "ml:references",
+    "regex": "ml:regex",
+    "repeated": "ml:repeated",
+    "replace": "ml:replace",
+    "sc": "https://schema.org/",
+    "separator": "ml:separator",
+    "source": "ml:source",
+    "subField": "ml:subField",
+    "transform": "ml:transform",
+    "wd": "https://www.wikidata.org/wiki/"
+  },
+  "@type": "sc:Dataset",
+  "name": "audio_test",
+  "description": "This is the basic test case for audio files",
+  "conformsTo": "http://mlcommons.org/croissant/1.0",
+  "url": "None",
+  "distribution": [
+    {
+      "@type": "sc:FileSet",
+      "name": "files",
+      "encodingFormat": "audio/mpeg",
+      "includes": "data/*.mp3"
+    }
+  ],
+  "recordSet": [
+    {
+      "@type": "ml:RecordSet",
+      "name": "records",
+      "description": "These are the records.",
+      "field": [
+        {
+          "@type": "ml:Field",
+          "name": "audio",
+          "description": "These are the sounds.",
+          "dataType": "sc:AudioObject",
+          "source": {
+            "distribution": "files",
+            "extract": {
+              "fileProperty": "content"
+            }
+          }
+        }
+      ]
+    }
+  ]
+}
diff --git a/datasets/0.8/audio_test/output/records.jsonl b/datasets/0.8/audio_test/output/records.jsonl
@@ -0,0 +1,2 @@
+{"audio": "(array([-2.8619270e-13, -1.7014803e-13,  2.7065091e-14, ...,\n       -6.4091455e-06, -3.7976279e-06,  2.7510678e-06], dtype=float32), 22050)"}
+{"audio": "(array([5.8726583e-14, 1.3397688e-13, 2.2199205e-13, ..., 4.2678180e-04,\n       1.9029720e-04, 2.7079385e-04], dtype=float32), 22050)"}
diff --git a/datasets/1.0/audio_test/data/Clap.mp3 b/datasets/1.0/audio_test/data/Clap.mp3
diff --git a/datasets/1.0/audio_test/data/Snap.mp3 b/datasets/1.0/audio_test/data/Snap.mp3
diff --git a/datasets/1.0/audio_test/metadata.json b/datasets/1.0/audio_test/metadata.json
@@ -0,0 +1,81 @@
+{
+  "@context": {
+    "@language": "en",
+    "@vocab": "https://schema.org/",
+    "citeAs": "cr:citeAs",
+    "column": "cr:column",
+    "conformsTo": "dct:conformsTo",
+    "cr": "http://mlcommons.org/croissant/",
+    "data": {
+      "@id": "cr:data",
+      "@type": "@json"
+    },
+    "dataBiases": "cr:dataBiases",
+    "dataCollection": "cr:dataCollection",
+    "dataType": {
+      "@id": "cr:dataType",
+      "@type": "@vocab"
+    },
+    "dct": "http://purl.org/dc/terms/",
+    "extract": "cr:extract",
+    "field": "cr:field",
+    "fileProperty": "cr:fileProperty",
+    "fileObject": "cr:fileObject",
+    "fileSet": "cr:fileSet",
+    "format": "cr:format",
+    "includes": "cr:includes",
+    "isEnumeration": "cr:isEnumeration",
+    "isLiveDataset": "cr:isLiveDataset",
+    "jsonPath": "cr:jsonPath",
+    "key": "cr:key",
+    "md5": "cr:md5",
+    "parentField": "cr:parentField",
+    "path": "cr:path",
+    "personalSensitiveInformation": "cr:personalSensitiveInformation",
+    "recordSet": "cr:recordSet",
+    "references": "cr:references",
+    "regex": "cr:regex",
+    "repeated": "cr:repeated",
+    "replace": "cr:replace",
+    "sc": "https://schema.org/",
+    "separator": "cr:separator",
+    "source": "cr:source",
+    "subField": "cr:subField",
+    "transform": "cr:transform",
+    "wd": "https://www.wikidata.org/wiki/"
+  },
+  "@type": "sc:Dataset",
+  "name": "audio_test",
+  "description": "This is the basic test case for audio files",
+  "conformsTo": "http://mlcommons.org/croissant/1.0",
+  "url": "None",
+  "distribution": [
+    {
+      "@type": "cr:FileSet",
+      "name": "files",
+      "encodingFormat": "audio/mpeg",
+      "includes": "data/*.mp3"
+    }
+  ],
+  "recordSet": [
+    {
+      "@type": "cr:RecordSet",
+      "name": "records",
+      "description": "These are the records.",
+      "field": [
+        {
+          "@type": "cr:Field",
+          "name": "audio",
+          "description": "These are the sounds.",
+          "dataType": "sc:AudioObject",
+          "source": {
+            "cr:fileSet": "files",
+            "extract": {
+              "fileProperty": "content"
+            }
+          }
+        }
+      ]
+    }
+  ]
+}
diff --git a/datasets/1.0/audio_test/output/records.jsonl b/datasets/1.0/audio_test/output/records.jsonl
@@ -0,0 +1,2 @@
+{"audio": "(array([-2.8619270e-13, -1.7014803e-13,  2.7065091e-14, ...,\n       -6.4091455e-06, -3.7976279e-06,  2.7510678e-06], dtype=float32), 22050)"}
+{"audio": "(array([5.8726583e-14, 1.3397688e-13, 2.2199205e-13, ..., 4.2678180e-04,\n       1.9029720e-04, 2.7079385e-04], dtype=float32), 22050)"}
diff --git a/python/mlcroissant/mlcroissant/_src/core/constants.py b/python/mlcroissant/mlcroissant/_src/core/constants.py
@@ -76,6 +76,14 @@ def ML_COMMONS(ctx) -> rdflib.Namespace:
 SCHEMA_ORG_CREATOR = namespace.SDO.creator
 SCHEMA_ORG_DATE_PUBLISHED = namespace.SDO.datePublished
 SCHEMA_ORG_DATASET = namespace.SDO.Dataset
+SCHEMA_ORG_DATA_TYPE_AUDIO_OBJECT = namespace.SDO.AudioObject
+SCHEMA_ORG_DATA_TYPE_BOOL = namespace.SDO.Boolean
+SCHEMA_ORG_DATA_TYPE_DATE = namespace.SDO.Date
+SCHEMA_ORG_DATA_TYPE_FLOAT = namespace.SDO.Float
+SCHEMA_ORG_DATA_TYPE_IMAGE_OBJECT = namespace.SDO.ImageObject
+SCHEMA_ORG_DATA_TYPE_INTEGER = namespace.SDO.Integer
+SCHEMA_ORG_DATA_TYPE_TEXT = namespace.SDO.Text
+SCHEMA_ORG_DATA_TYPE_URL = namespace.SDO.URL
 SCHEMA_ORG_DESCRIPTION = namespace.SDO.description
 SCHEMA_ORG_DISTRIBUTION = namespace.SDO.distribution
 SCHEMA_ORG_EMAIL = namespace.SDO.email
@@ -154,8 +162,10 @@ class EncodingFormat:
 
     CSV = "text/csv"
     GIT = "git+https"
+    JPG = "image/jpeg"
     JSON = "application/json"
     JSON_LINES = "application/jsonlines"
+    MP3 = "audio/mpeg"
     PARQUET = "application/x-parquet"
     TEXT = "text/plain"
     TSV = "text/tsv"
@@ -166,6 +176,7 @@ class EncodingFormat:
 class DataType:
     """Data types supported by Croissant."""
 
+    AUDIO_OBJECT = namespace.SDO.AudioObject
     BOOL = namespace.SDO.Boolean
     BOUNDING_BOX = lambda ctx: ML_COMMONS(ctx).BoundingBox
     DATE = namespace.SDO.Date

diff --git a/python/mlcroissant/mlcroissant/_src/core/optional.py b/python/mlcroissant/mlcroissant/_src/core/optional.py
@@ -86,6 +86,11 @@ def PIL_Image(cls) -> types.ModuleType:  # pylint: disable=invalid-name
         """Cached PIL module."""
         return _try_import("PIL.Image", package_name="Pillow")
 
+    @cached_class_property
+    def librosa(cls) -> types.ModuleType:  # pylint: disable=invalid-name
+        """Cached librosa module."""
+        return _try_import("librosa", package_name="librosa")
+
     @cached_class_property
     def torchdata_datapipes(cls) -> types.ModuleType:
         """Cached torchdata module."""

diff --git a/python/mlcroissant/mlcroissant/_src/datasets_test.py b/python/mlcroissant/mlcroissant/_src/datasets_test.py
@@ -135,6 +135,11 @@ def test_hermetic_loading(version, dataset_name, record_set_name, num_records):
         ["huggingface-c4/metadata.json", "en", 1],
         ["huggingface-mnist/metadata.json", "default", 10],
         ["titanic/metadata.json", "passengers", -1],
+        [
+            "audio_test/metadata.json",
+            "records",
+            10,
+        ],
     ],
 )
 def test_nonhermetic_loading(version, dataset_name, record_set_name, num_records):

diff --git a/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field.py b/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field.py
@@ -75,6 +75,9 @@ def _cast_value(ctx: Context, value: Any, data_type: type | term.URIRef | None):
             return deps.PIL_Image.open(io.BytesIO(value))
         else:
             raise ValueError(f"Type {type(value)} is not accepted for an image.")
+    elif data_type == DataType.AUDIO_OBJECT:
+        output = deps.librosa.load(io.BytesIO(value))
+        return output
     elif data_type == DataType.BOUNDING_BOX(ctx):  # pytype: disable=wrong-arg-types
         return bounding_box.parse(value)
     elif not isinstance(data_type, type):

diff --git a/python/mlcroissant/mlcroissant/_src/operation_graph/operations/read.py b/python/mlcroissant/mlcroissant/_src/operation_graph/operations/read.py
@@ -120,6 +120,13 @@ def _read_file_content(self, encoding_format: str, file: Path) -> pd.DataFrame:
                     return pd.DataFrame({
                         FileProperty.content: [file.read()],
                     })
+            elif (
+                encoding_format == EncodingFormat.MP3
+                or encoding_format == EncodingFormat.JPG
+            ):
+                return pd.DataFrame({
+                    FileProperty.content: [file.read()],
+                })
             else:
                 raise ValueError(
                     f"Unsupported encoding format for file: {encoding_format}"

diff --git a/python/mlcroissant/mlcroissant/_src/structure_graph/nodes/field.py b/python/mlcroissant/mlcroissant/_src/structure_graph/nodes/field.py
@@ -99,6 +99,7 @@ def data_type(self) -> type | term.URIRef | None:
                     DataType.IMAGE_OBJECT,
                     # For some reasons, pytype cannot infer `Any` on ctx:
                     DataType.BOUNDING_BOX(self.ctx),  # pytype: disable=wrong-arg-types
+                    DataType.AUDIO_OBJECT,
                 ]:
                     return term.URIRef(data_type)
         # The data_type has to be found on a predecessor:

diff --git a/python/mlcroissant/pyproject.toml b/python/mlcroissant/pyproject.toml
@@ -7,6 +7,7 @@ version = "0.8.2"
 authors = [
   { name = "Joaquin Vanschoren" },
   { name = "Jos van der Velde" },
+  { name = "Monjish Bhattacharyya" },
   { name = "Omar Benjelloun" },
   { name = "Peter Mattson" },
   { name = "Pieter Gijsbers" },
@@ -27,7 +28,7 @@ dependencies = [
   "python-dateutil",
   "rdflib",
   "requests",
-  "tqdm",
+  "tqdm"
 ]
 readme = "README.md"
 
@@ -38,6 +39,7 @@ dev = [
   "black==23.11.0",
   "datasets",
   "flake8-docstrings",
+  "mlcroissant[audio]",
   "mlcroissant[git]",
   "mlcroissant[image]",
   "mlcroissant[parquet]",
@@ -49,6 +51,7 @@ dev = [
   "pytype",
   "torchdata",
 ]
+audio = ["librosa"]
 git = ["GitPython"]
 image = ["Pillow"]
 parquet = ["pyarrow"]
@@ -80,6 +83,7 @@ module = [
   "datasets",
   "etils.*",
   "jsonpath_rw",
+  "librosa",
   "networkx",
   "pandas",
   "pillow",
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		{"audio": "(array([-2.8619270e-13, -1.7014803e-13, 2.7065091e-14, ...,\n -6.4091455e-06, -3.7976279e-06, 2.7510678e-06], dtype=float32), 22050)"}
		{"audio": "(array([5.8726583e-14, 1.3397688e-13, 2.2199205e-13, ..., 4.2678180e-04,\n 1.9029720e-04, 2.7079385e-04], dtype=float32), 22050)"}