Skip to content

Commit

Permalink
Setup_audio (#322)
Browse files Browse the repository at this point in the history
Croissant can now recognize fields of type sc:AudioObject and return their decoded audio content (loaded with librosa) as output.
  • Loading branch information
monke6942021 authored Feb 8, 2024
1 parent 48938fb commit 4f2e86f
Show file tree
Hide file tree
Showing 15 changed files with 196 additions and 1 deletion.
Binary file added datasets/0.8/audio_test/data/Clap.mp3
Binary file not shown.
Binary file added datasets/0.8/audio_test/data/Snap.mp3
Binary file not shown.
74 changes: 74 additions & 0 deletions datasets/0.8/audio_test/metadata.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
{
"@context": {
"@language": "en",
"@vocab": "https://schema.org/",
"column": "ml:column",
"data": {
"@id": "ml:data",
"@type": "@json"
},
"dataBiases": "ml:dataBiases",
"dataCollection": "ml:dataCollection",
"dataType": {
"@id": "ml:dataType",
"@type": "@vocab"
},
"dct": "http://purl.org/dc/terms/",
"extract": "ml:extract",
"field": "ml:field",
"fileProperty": "ml:fileProperty",
"format": "ml:format",
"includes": "ml:includes",
"isEnumeration": "ml:isEnumeration",
"jsonPath": "ml:jsonPath",
"ml": "http://mlcommons.org/schema/",
"parentField": "ml:parentField",
"path": "ml:path",
"personalSensitiveInformation": "ml:personalSensitiveInformation",
"recordSet": "ml:recordSet",
"references": "ml:references",
"regex": "ml:regex",
"repeated": "ml:repeated",
"replace": "ml:replace",
"sc": "https://schema.org/",
"separator": "ml:separator",
"source": "ml:source",
"subField": "ml:subField",
"transform": "ml:transform",
"wd": "https://www.wikidata.org/wiki/"
},
"@type": "sc:Dataset",
"name": "audio_test",
"description": "This is the basic test case for audio files",
"conformsTo": "http://mlcommons.org/croissant/1.0",
"url": "None",
"distribution": [
{
"@type": "sc:FileSet",
"name": "files",
"encodingFormat": "audio/mpeg",
"includes": "data/*.mp3"
}
],
"recordSet": [
{
"@type": "ml:RecordSet",
"name": "records",
"description": "These are the records.",
"field": [
{
"@type": "ml:Field",
"name": "audio",
"description": "These are the sounds.",
"dataType": "sc:AudioObject",
"source": {
"distribution": "files",
"extract": {
"fileProperty": "content"
}
}
}
]
}
]
}
2 changes: 2 additions & 0 deletions datasets/0.8/audio_test/output/records.jsonl
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
{"audio": "(array([-2.8619270e-13, -1.7014803e-13, 2.7065091e-14, ...,\n -6.4091455e-06, -3.7976279e-06, 2.7510678e-06], dtype=float32), 22050)"}
{"audio": "(array([5.8726583e-14, 1.3397688e-13, 2.2199205e-13, ..., 4.2678180e-04,\n 1.9029720e-04, 2.7079385e-04], dtype=float32), 22050)"}
Binary file added datasets/1.0/audio_test/data/Clap.mp3
Binary file not shown.
Binary file added datasets/1.0/audio_test/data/Snap.mp3
Binary file not shown.
81 changes: 81 additions & 0 deletions datasets/1.0/audio_test/metadata.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
{
"@context": {
"@language": "en",
"@vocab": "https://schema.org/",
"citeAs": "cr:citeAs",
"column": "cr:column",
"conformsTo": "dct:conformsTo",
"cr": "http://mlcommons.org/croissant/",
"data": {
"@id": "cr:data",
"@type": "@json"
},
"dataBiases": "cr:dataBiases",
"dataCollection": "cr:dataCollection",
"dataType": {
"@id": "cr:dataType",
"@type": "@vocab"
},
"dct": "http://purl.org/dc/terms/",
"extract": "cr:extract",
"field": "cr:field",
"fileProperty": "cr:fileProperty",
"fileObject": "cr:fileObject",
"fileSet": "cr:fileSet",
"format": "cr:format",
"includes": "cr:includes",
"isEnumeration": "cr:isEnumeration",
"isLiveDataset": "cr:isLiveDataset",
"jsonPath": "cr:jsonPath",
"key": "cr:key",
"md5": "cr:md5",
"parentField": "cr:parentField",
"path": "cr:path",
"personalSensitiveInformation": "cr:personalSensitiveInformation",
"recordSet": "cr:recordSet",
"references": "cr:references",
"regex": "cr:regex",
"repeated": "cr:repeated",
"replace": "cr:replace",
"sc": "https://schema.org/",
"separator": "cr:separator",
"source": "cr:source",
"subField": "cr:subField",
"transform": "cr:transform",
"wd": "https://www.wikidata.org/wiki/"
},
"@type": "sc:Dataset",
"name": "audio_test",
"description": "This is the basic test case for audio files",
"conformsTo": "http://mlcommons.org/croissant/1.0",
"url": "None",
"distribution": [
{
"@type": "cr:FileSet",
"name": "files",
"encodingFormat": "audio/mpeg",
"includes": "data/*.mp3"
}
],
"recordSet": [
{
"@type": "cr:RecordSet",
"name": "records",
"description": "These are the records.",
"field": [
{
"@type": "cr:Field",
"name": "audio",
"description": "These are the sounds.",
"dataType": "sc:AudioObject",
"source": {
"cr:fileSet": "files",
"extract": {
"fileProperty": "content"
}
}
}
]
}
]
}
2 changes: 2 additions & 0 deletions datasets/1.0/audio_test/output/records.jsonl
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
{"audio": "(array([-2.8619270e-13, -1.7014803e-13, 2.7065091e-14, ...,\n -6.4091455e-06, -3.7976279e-06, 2.7510678e-06], dtype=float32), 22050)"}
{"audio": "(array([5.8726583e-14, 1.3397688e-13, 2.2199205e-13, ..., 4.2678180e-04,\n 1.9029720e-04, 2.7079385e-04], dtype=float32), 22050)"}
11 changes: 11 additions & 0 deletions python/mlcroissant/mlcroissant/_src/core/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,14 @@ def ML_COMMONS(ctx) -> rdflib.Namespace:
SCHEMA_ORG_CREATOR = namespace.SDO.creator
SCHEMA_ORG_DATE_PUBLISHED = namespace.SDO.datePublished
SCHEMA_ORG_DATASET = namespace.SDO.Dataset
SCHEMA_ORG_DATA_TYPE_AUDIO_OBJECT = namespace.SDO.AudioObject
SCHEMA_ORG_DATA_TYPE_BOOL = namespace.SDO.Boolean
SCHEMA_ORG_DATA_TYPE_DATE = namespace.SDO.Date
SCHEMA_ORG_DATA_TYPE_FLOAT = namespace.SDO.Float
SCHEMA_ORG_DATA_TYPE_IMAGE_OBJECT = namespace.SDO.ImageObject
SCHEMA_ORG_DATA_TYPE_INTEGER = namespace.SDO.Integer
SCHEMA_ORG_DATA_TYPE_TEXT = namespace.SDO.Text
SCHEMA_ORG_DATA_TYPE_URL = namespace.SDO.URL
SCHEMA_ORG_DESCRIPTION = namespace.SDO.description
SCHEMA_ORG_DISTRIBUTION = namespace.SDO.distribution
SCHEMA_ORG_EMAIL = namespace.SDO.email
Expand Down Expand Up @@ -154,8 +162,10 @@ class EncodingFormat:

CSV = "text/csv"
GIT = "git+https"
JPG = "image/jpeg"
JSON = "application/json"
JSON_LINES = "application/jsonlines"
MP3 = "audio/mpeg"
PARQUET = "application/x-parquet"
TEXT = "text/plain"
TSV = "text/tsv"
Expand All @@ -166,6 +176,7 @@ class EncodingFormat:
class DataType:
"""Data types supported by Croissant."""

AUDIO_OBJECT = namespace.SDO.AudioObject
BOOL = namespace.SDO.Boolean
BOUNDING_BOX = lambda ctx: ML_COMMONS(ctx).BoundingBox
DATE = namespace.SDO.Date
Expand Down
5 changes: 5 additions & 0 deletions python/mlcroissant/mlcroissant/_src/core/optional.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,11 @@ def PIL_Image(cls) -> types.ModuleType: # pylint: disable=invalid-name
"""Cached PIL module."""
return _try_import("PIL.Image", package_name="Pillow")

@cached_class_property
def librosa(cls) -> types.ModuleType: # pylint: disable=invalid-name
    """Cached librosa module.

    librosa is an optional dependency: pyproject.toml declares it under the
    `audio` extra, so it is only available when that extra is installed.
    The import is delegated to `_try_import`.
    """
    # NOTE(review): presumably `_try_import` reports `package_name` in its error
    # when the module is missing -- confirm against its definition.
    return _try_import("librosa", package_name="librosa")

@cached_class_property
def torchdata_datapipes(cls) -> types.ModuleType:
"""Cached torchdata module."""
Expand Down
5 changes: 5 additions & 0 deletions python/mlcroissant/mlcroissant/_src/datasets_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,11 @@ def test_hermetic_loading(version, dataset_name, record_set_name, num_records):
["huggingface-c4/metadata.json", "en", 1],
["huggingface-mnist/metadata.json", "default", 10],
["titanic/metadata.json", "passengers", -1],
[
"audio_test/metadata.json",
"records",
10,
],
],
)
def test_nonhermetic_loading(version, dataset_name, record_set_name, num_records):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,9 @@ def _cast_value(ctx: Context, value: Any, data_type: type | term.URIRef | None):
return deps.PIL_Image.open(io.BytesIO(value))
else:
raise ValueError(f"Type {type(value)} is not accepted for an image.")
elif data_type == DataType.AUDIO_OBJECT:
output = deps.librosa.load(io.BytesIO(value))
return output
elif data_type == DataType.BOUNDING_BOX(ctx): # pytype: disable=wrong-arg-types
return bounding_box.parse(value)
elif not isinstance(data_type, type):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,13 @@ def _read_file_content(self, encoding_format: str, file: Path) -> pd.DataFrame:
return pd.DataFrame({
FileProperty.content: [file.read()],
})
elif (
encoding_format == EncodingFormat.MP3
or encoding_format == EncodingFormat.JPG
):
return pd.DataFrame({
FileProperty.content: [file.read()],
})
else:
raise ValueError(
f"Unsupported encoding format for file: {encoding_format}"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,7 @@ def data_type(self) -> type | term.URIRef | None:
DataType.IMAGE_OBJECT,
# For some reasons, pytype cannot infer `Any` on ctx:
DataType.BOUNDING_BOX(self.ctx), # pytype: disable=wrong-arg-types
DataType.AUDIO_OBJECT,
]:
return term.URIRef(data_type)
# The data_type has to be found on a predecessor:
Expand Down
6 changes: 5 additions & 1 deletion python/mlcroissant/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ version = "0.8.2"
authors = [
{ name = "Joaquin Vanschoren" },
{ name = "Jos van der Velde" },
{ name = "Monjish Bhattacharyya" },
{ name = "Omar Benjelloun" },
{ name = "Peter Mattson" },
{ name = "Pieter Gijsbers" },
Expand All @@ -27,7 +28,7 @@ dependencies = [
"python-dateutil",
"rdflib",
"requests",
"tqdm",
"tqdm"
]
readme = "README.md"

Expand All @@ -38,6 +39,7 @@ dev = [
"black==23.11.0",
"datasets",
"flake8-docstrings",
"mlcroissant[audio]",
"mlcroissant[git]",
"mlcroissant[image]",
"mlcroissant[parquet]",
Expand All @@ -49,6 +51,7 @@ dev = [
"pytype",
"torchdata",
]
audio = ["librosa"]
git = ["GitPython"]
image = ["Pillow"]
parquet = ["pyarrow"]
Expand Down Expand Up @@ -80,6 +83,7 @@ module = [
"datasets",
"etils.*",
"jsonpath_rw",
"librosa",
"networkx",
"pandas",
"pillow",
Expand Down

0 comments on commit 4f2e86f

Please sign in to comment.