-
Notifications
You must be signed in to change notification settings - Fork 49
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Setup_audio #322
Setup_audio #322
Changes from 49 commits
ade26eb
5faaf02
b84be69
82b0217
506bd0d
9861395
50d78ac
369621d
5723b3f
4b7150e
fafe57d
30d2aca
5f23744
56a86e8
babbbc2
8cd0e89
7d22310
363bac6
4e5f54e
081c129
5a88008
154ef14
28116d8
f42626f
7f94125
7706916
7fdc4b8
e4011ea
8be8b76
dcc8e01
e9121b2
8924b08
2e24289
5a441e8
12a0fd2
578127e
bec4b4f
2dd38a7
ffb126d
ca6db64
24ffa9f
4d48a9a
6777a07
b70229e
5c0eaed
2a69efd
ab23572
cf043c8
fd5393a
4c40d91
230d1fd
ba72b11
0177150
9f2d083
c5d38e2
b587614
1834c49
7611a5a
c66524b
a777c7f
8b9b511
b34ede7
c0ce931
63603e4
d212703
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,75 @@ | ||
{ | ||
"@context": { | ||
"@language": "en", | ||
"@vocab": "https://schema.org/", | ||
"column": "ml:column", | ||
"conformsTo": "dct:conformsTo", | ||
"data": { | ||
"@id": "ml:data", | ||
"@type": "@json" | ||
}, | ||
"dataBiases": "ml:dataBiases", | ||
"dataCollection": "ml:dataCollection", | ||
"dataType": { | ||
"@id": "ml:dataType", | ||
"@type": "@vocab" | ||
}, | ||
"dct": "http://purl.org/dc/terms/", | ||
"extract": "ml:extract", | ||
"field": "ml:field", | ||
"fileProperty": "ml:fileProperty", | ||
"format": "ml:format", | ||
"includes": "ml:includes", | ||
"isEnumeration": "ml:isEnumeration", | ||
"jsonPath": "ml:jsonPath", | ||
"ml": "http://mlcommons.org/schema/", | ||
"parentField": "ml:parentField", | ||
"path": "ml:path", | ||
"personalSensitiveInformation": "ml:personalSensitiveInformation", | ||
"recordSet": "ml:recordSet", | ||
"references": "ml:references", | ||
"regex": "ml:regex", | ||
"repeated": "ml:repeated", | ||
"replace": "ml:replace", | ||
"sc": "https://schema.org/", | ||
"separator": "ml:separator", | ||
"source": "ml:source", | ||
"subField": "ml:subField", | ||
"transform": "ml:transform", | ||
"wd": "https://www.wikidata.org/wiki/" | ||
}, | ||
"@type": "sc:Dataset", | ||
"name": "audio_test", | ||
"description": "This is the basic test case for audio files", | ||
"conformsTo": "http://mlcommons.org/croissant/1.0", | ||
"url": "None", | ||
"distribution": [ | ||
{ | ||
"@type": "sc:FileSet", | ||
"name": "files", | ||
"encodingFormat": "audio/mpeg", | ||
"includes": "data/*.mp3" | ||
} | ||
], | ||
"recordSet": [ | ||
{ | ||
"@type": "ml:RecordSet", | ||
"name": "records", | ||
"description": "These are the records.", | ||
"field": [ | ||
{ | ||
"@type": "ml:Field", | ||
"name": "audio", | ||
"description": "These are the sounds.", | ||
"dataType": "sc:AudioObject", | ||
"source": { | ||
"distribution": "files", | ||
"extract": { | ||
"fileProperty": "content" | ||
} | ||
} | ||
} | ||
] | ||
} | ||
] | ||
} |
Large diffs are not rendered by default.
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -86,5 +86,10 @@ def PIL_Image(cls) -> types.ModuleType: # pylint: disable=invalid-name | |
"""Cached PIL.Image module.""" | ||
return _try_import("PIL.Image", package_name="Pillow") | ||
|
||
@cached_class_property | ||
def LIB_Audio(cls) -> types.ModuleType: # pylint: disable=invalid-name | ||
"""Cached librosa module.""" | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
return _try_import("librosa", package_name="librosa") | ||
|
||
|
||
deps = OptionalDependencies |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -127,6 +127,11 @@ def test_hermetic_loading(dataset_name, record_set_name, num_records): | |
["huggingface-c4/metadata.json", "en", 1], | ||
["huggingface-mnist/metadata.json", "default", 10], | ||
["titanic/metadata.json", "passengers", -1], | ||
[ | ||
"audio_test/metadata.json", | ||
"records", | ||
-1, | ||
], # Switch the number to 10 if necessary | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. necessary There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Remove the comment? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I just removed it. |
||
], | ||
) | ||
def test_nonhermetic_loading(dataset_name, record_set_name, num_records): | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -31,6 +31,9 @@ def _cast_value(value: Any, data_type: type | term.URIRef | None): | |
return deps.PIL_Image.open(io.BytesIO(value)) | ||
else: | ||
raise ValueError(f"Type {type(value)} is not accepted for an image.") | ||
elif data_type == DataType.AUDIO_OBJECT: | ||
output = deps.LIB_Audio.load(io.BytesIO(value)) | ||
return str([output[0].tolist(), output[1]]) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Why a str? How will you use this in an ML pipeline? What would be the most useful signal to return here? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I did it like that because the output needs to have double quotes. I'm about to push a version that just outputs the regular librosa output, but I don't know if it will pass the test. |
||
elif data_type == DataType.BOUNDING_BOX: | ||
return bounding_box.parse(value) | ||
elif not isinstance(data_type, type): | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -7,6 +7,7 @@ version = "0.0.5" | |
authors = [ | ||
{ name = "Joaquin Vanschoren" }, | ||
{ name = "Jos van der Velde" }, | ||
{ name = "Monjish Bhattacharyya" }, | ||
{ name = "Omar Benjelloun" }, | ||
{ name = "Peter Mattson" }, | ||
{ name = "Pieter Gijsbers" }, | ||
|
@@ -18,6 +19,7 @@ authors = [ | |
# pip dependencies of the project | ||
# Installed locally with `pip install -e .` | ||
dependencies = [ | ||
"black[jupyter]", | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Remove jupyter? |
||
"absl-py", | ||
"etils[epath]", | ||
"jsonpath-rw", | ||
|
@@ -27,7 +29,7 @@ dependencies = [ | |
"python-dateutil", | ||
"rdflib", | ||
"requests", | ||
"tqdm", | ||
"tqdm" | ||
] | ||
readme = "README.md" | ||
|
||
|
@@ -38,6 +40,7 @@ dev = [ | |
"black==23.11.0", | ||
"datasets", | ||
"flake8-docstrings", | ||
"mlcroissant[audio]", | ||
"mlcroissant[git]", | ||
"mlcroissant[image]", | ||
"mlcroissant[parquet]", | ||
|
@@ -48,6 +51,7 @@ dev = [ | |
"pytest", | ||
"pytype", | ||
] | ||
audio = ["librosa"] | ||
git = ["GitPython"] | ||
image = ["Pillow"] | ||
parquet = ["pyarrow"] | ||
|
@@ -79,9 +83,10 @@ module = [ | |
"datasets", | ||
"etils.*", | ||
"jsonpath_rw", | ||
"librosa", | ||
"networkx", | ||
"pandas", | ||
"pillow", | ||
"pillow" | ||
] | ||
ignore_missing_imports = true | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
def librosa
(PIL.Image above is a library, so the name here should probably be librosa.) There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I just changed the name, and the usage in field.py